packages <- c(
  "CIMseq", "CIMseq.data", "printr", "ggthemes", "tidyverse"
)
purrr::walk(packages, library, character.only = TRUE)
rm(packages)

##DATA
load('../data/CIMseqData.rda')
load('../data/sObj.rda')
#rename classes
renameClasses <- function(class) {
  case_when(
    class == "0" ~ "C.Lgr5.proximal.1",
    class == "1" ~ "SI.Stem",
    class == "2" ~ "C.TA.proximal.1",
    class == "3" ~ "C.Lgr5.proximal.2",
    class == "4" ~ "C.Lgr5.distal",
    class == "5" ~ "SI.Lgr5.Mki67",
    class == "6" ~ "C.Goblet.distal.1",
    class == "7" ~ "SI.TA.1",
    class == "8" ~ "C.TA.proximal.2",
    class == "9" ~ "C.Goblet.distal.2",
    class == "10" ~ "SI.TA.2",
    class == "11" ~ "C.Goblet.proximal.1",
    class == "12" ~ "SI.Goblet",
    class == "13" ~ "SI.Enterocytes",
    class == "14" ~ "C.Colonocytes.distal",
    class == "15" ~ "C.Goblet.distal.3",
    class == "16" ~ "C.Lgr5.Mki67.proximal.1",
    class == "17" ~ "C.TA.distal",
    class == "18" ~ "C.Lgr5.Mki67.proximal.2",
    class == "19" ~ "C.Colonocyte.proximal.1",
    class == "20" ~ "Tufft",
    class == "21" ~ "C.Colonocyte.proximal.2",
    class == "22" ~ "C.Goblet.proximal.2",
    class == "23" ~ "Enteroendocrine",
    class == "24" ~ "C.Goblet.distal.4",
    class == "25" ~ "SI.Paneth",
    TRUE ~ "error"
  )
}

c.order <- c(
  "SI.Goblet", "SI.Paneth", "SI.Stem", "SI.Lgr5.Mki67", "SI.TA.1", "SI.TA.2", "SI.Enterocytes",
  "C.Goblet.proximal.1", "C.Goblet.proximal.2", "C.Lgr5.proximal.1", "C.Lgr5.proximal.2", "C.Lgr5.Mki67.proximal.1", "C.Lgr5.Mki67.proximal.2", "C.TA.proximal.1", "C.TA.proximal.2", "C.Colonocyte.proximal.1", "C.Colonocyte.proximal.2",
  "C.Goblet.distal.1", "C.Goblet.distal.2", "C.Goblet.distal.3", "C.Goblet.distal.4", "C.Lgr5.distal", "C.TA.distal", "C.Colonocytes.distal",
  "Tufft", "Enteroendocrine"
)

getData(cObjSng, "classification") <- renameClasses(getData(cObjSng, "classification"))
fractions <- getData(sObj, "fractions")
colnames(fractions) <- renameClasses(colnames(fractions))
sObj@fractions <- fractions

Fig 1: Classes

plotUnsupervisedClass(cObjSng, cObjMul)
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

Fig 2: Cell type gene expression

plotUnsupervisedMarkers(
  cObjSng, cObjMul,
  c("Lgr5", "Ptprc", "Chga", "Dclk1", "Slc26a3", "Atoh1", "Alpi", "Lyz1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

Fig 3: Cell cycle and architecture marker

plotUnsupervisedMarkers(
  cObjSng, cObjMul, c("Mki67"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

plotUnsupervisedMarkers(
  cObjSng, cObjMul, c("Plet1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

plotUnsupervisedMarkers(
  cObjSng, cObjMul, c("Hoxb13"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

Fig 4: Plates

plotUnsupervisedClass(cObjSng, cObjMul) %>%
  plotData() %>%
  inner_join(MGA.Meta, by = c("Sample" = "sample")) %>%
  ggplot() +
  geom_point(aes(`dim.red dim 1`, `dim.red dim 2`, colour = unique_key)) +
  ggthemes::theme_few() +
  scale_colour_manual(values = col40())
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

Fig 5: Mice

plotUnsupervisedClass(cObjSng, cObjMul) %>%
  plotData() %>%
  inner_join(MGA.Meta, by = c("Sample" = "sample")) %>%
  mutate(subject_id = as.character(subject_id)) %>%
  ggplot() +
  geom_point(aes(`dim.red dim 1`, `dim.red dim 2`, colour = subject_id), alpha = 0.3) +
  ggthemes::theme_few() +
  scale_colour_manual(values = col40())
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

Fig 6: Age

plotUnsupervisedClass(cObjSng, cObjMul) %>%
  plotData() %>%
  inner_join(MGA.Meta, by = c("Sample" = "sample")) %>%
  mutate(subject_age = as.character(subject_age)) %>%
  ggplot() +
  geom_point(aes(`dim.red dim 1`, `dim.red dim 2`, colour = subject_age)) +
  ggthemes::theme_few() +
  scale_colour_manual(values = col40())
## Warning in .checkEstimateCellsInput(counts.ercc): Results will not be
## accurate. These samples ERCC reads are all 0's: s.SRR5103562, s.SRR5102813,
## s.SRR5102362, s.SRR5102747, s.SRR5103063, ...

Fig 7: Classes per source

tibble(
  sample = colnames(getData(cObjSng, "counts")),
  class = getData(cObjSng, "classification")
) %>%
  mutate(source = if_else(
    str_detect(sample, "SRR654") | str_detect(sample, "SRR510"), 
    "External", "Enge"
  )) %>%
  group_by(source, class) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  group_by(source) %>%
  mutate(`%` = n / sum(n) * 100) %>%
  ggplot() +
  geom_bar(aes(source, `%`, fill = source), stat = "identity", position = position_dodge(width = 1)) +
  facet_wrap(~class, scales = "free") +
  labs(x = "Source", y = "% of dataset") +
  theme_bw() +
  theme(
    legend.position = "top",
    axis.title.x = element_blank()
  ) +
  guides(fill = FALSE)

Fig 8: Source overlay

getData(cObjSng, "dim.red") %>%
  matrix_to_tibble("sample") %>%
  setNames(c("sample", "UMAP.dim1", "UMAP.dim2")) %>%
  mutate(source = case_when(
    str_detect(sample, "SRR654") ~ "Tabula Muris",
    str_detect(sample, "SRR510") ~ "Regev",
    TRUE ~ "Enge"
  )) %>%
  sample_n(nrow(.), FALSE) %>%
  ggplot() +
  geom_point(aes(UMAP.dim1, UMAP.dim2, colour = source), alpha = 0.75) +
  theme_few()

Fig 9: Class mean correlation

averageGroupExpression <- function(exp, classes) {
  c <- unique(classes)
  means <- purrr::map_dfc(c, function(x) {
    data.frame(rowMeans(exp[, classes == x]))
  })
  means <- as.matrix(means)
  colnames(means) <- c
  return(means)
}

av.exp <- averageGroupExpression(
  getData(cObjSng, "counts.cpm")[getData(cObjMul, "features"), ], 
  getData(cObjSng, "classification")
)
p.cor <- cor(av.exp, method = "pearson")
p.cor %>%
  matrix_to_tibble("to") %>%
  gather(from, cor, -to) %>%
  mutate(
    from = parse_factor(from, levels = c.order),
    to = parse_factor(to, levels = c.order)
  ) %>%
  ggplot() +
  geom_tile(aes(from, to, fill = cor)) +
  scale_fill_viridis_c() +
  theme_few() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.title = element_blank()
  ) +
  guides(fill = guide_colorbar(title = "Pearson's\ncorrelation"))

Fig 10: Connections per multiplet

There are 2137 multiplets in the analysis.

adj <- adjustFractions(cObjSng.enge, cObjMul, sObj, theoretical.max = 4)
nConnections <- rowSums(adj)
table(nConnections) / length(nConnections) * 100
0 1 2 3 4 5
5.568554 31.44595 39.77539 17.45438 4.960225 0.7955077
tibble(
  sample = names(nConnections),
  connections = nConnections
) %>% 
  inner_join(MGA.Meta, by = "sample") %>%
  count(sub_tissue, connections)
sub_tissue connections n
colon 0 111
colon 1 556
colon 2 625
colon 3 297
colon 4 98
colon 5 16
small_intestine 0 8
small_intestine 1 116
small_intestine 2 225
small_intestine 3 76
small_intestine 4 8
small_intestine 5 1

Fig 11: Fraction histogram

fractions <- getData(sObj, "fractions")
tibble(fractions = c(fractions)) %>%
  ggplot() +
  geom_histogram(aes(fractions), binwidth = 0.01) +
  theme_bw()

Fig 12: Detected cell types vs. cost

costs <- getData(sObj, "costs")
tibble(
  nCellTypes = nConnections,
  cost = costs
) %>%
  ggplot() +
  geom_boxplot(aes(nCellTypes, cost, group = nCellTypes)) +
  scale_x_continuous(name = "Detected cell types", breaks = 0:max(nConnections)) +
  theme_bw()

Fig 13: Estimated cell numbers vs. cost

estimatedCells <- estimateCells(cObjSng.enge, cObjMul, theoretical.max = 4)
tibble(
  sample = names(costs),
  cost = unname(costs)
) %>%
  inner_join(
    select(estimatedCells, sample, estimatedCellNumber), 
    by = "sample"
  ) %>%
  mutate(estimatedCellNumber = round(estimatedCellNumber)) %>%
  ggplot() +
  geom_boxplot(aes(estimatedCellNumber, cost, group = estimatedCellNumber)) +
  scale_x_continuous(
    name = "ERCC estimated cell number"
    #breaks = 0:max(round(pull(estimateCells(cObjSng, cObjMul), estimatedCellNumber)))
  ) +
  theme_bw()

Fig 14: Estimated cell number vs. Detected cell number

tibble(
  sample = names(nConnections), 
  detectedConnections = nConnections
) %>% 
  inner_join(estimatedCells) %>% 
  mutate(estimatedCellNumber = round(estimatedCellNumber)) %>%
  ggplot() +
  geom_boxplot(aes(estimatedCellNumber, detectedConnections, group = estimatedCellNumber)) +
  scale_x_continuous(
    name = "ERCC estimated cell number", 
    breaks = 0:max(round(pull(filter(estimatedCells, sampleType == "Multiplet"), estimatedCellNumber)))
  ) +
  scale_y_continuous(
    name = "Detected cell number",
    breaks = 0:max(round(nConnections))
  ) +
  theme_bw()
## Joining, by = "sample"

Fig 15: Detected cell number vs. Total counts

tibble(
  sample = names(nConnections),
  detectedConnections = nConnections
) %>%
  inner_join(tibble(
    sample = colnames(getData(cObjMul, "counts")),
    total.counts = colSums(getData(cObjMul, "counts"))
  ), by = "sample") %>%
  ggplot() +
  geom_boxplot(aes(detectedConnections, total.counts, group = detectedConnections)) +
  scale_x_continuous(
    name = "Detected cell number", 
    breaks = 0:max(nConnections)
  ) +
  scale_y_continuous(name = "Total counts") +
  theme_bw()

Fig 16: Detected cell number vs. Total ERCC counts

tibble(
  sample = names(nConnections),
  detectedConnections = nConnections
) %>%
  inner_join(tibble(
    sample = colnames(getData(cObjMul, "counts")),
    total.ercc = colSums(getData(cObjMul, "counts.ercc"))
  ), by = "sample") %>%
  ggplot() +
  geom_boxplot(aes(detectedConnections, total.ercc, group = detectedConnections)) +
  scale_x_continuous(
    name = "Detected cell number", 
    breaks = 0:max(nConnections)
  ) +
  scale_y_continuous(name = "Total ERCC counts") +
  theme_bw()

Fig 17: Connections

plotSwarmCircos(sObj, cObjSng.enge, cObjMul, classOrder = c.order)

Fig 18: Filtered

Only detected duplicates and triplicates.
ERCC estimated cell number max = 4.
Weight cutoff = 10.

adj <- adjustFractions(cObjSng.enge, cObjMul, sObj, binary = TRUE)
samples <- rownames(adj)
rs <- rowSums(adj)
keep <- rs == 2 | rs == 3 | rs == 4


plotSwarmCircos(
  filterSwarm(sObj, keep), cObjSng.enge, cObjMul, weightCut = 10, 
  classOrder = c.order, theoretical.max = 4)